import pandas as pd
import numpy as np
import random

from config import *
# Seed Python's `random` module with a fixed value.
random.seed(178006)

print('-------- GETTING SUBSCRIBER DATA ----------------------')
print()

# Read the subscriber file; the id columns are loaded as object and age as
# str instead of letting pandas infer their dtypes.
dt = {'subscriberid': object, 'personkey': object, 'age': str}
df = pd.read_csv(SUBSCRIBER_PATH, dtype=dt)
print(f'{len(df)} observations')

# Keep rows in the 'Individual' market whose metal code is 2, 3 or 4
# (the tiers the message below refers to as bronze/silver/gold).
print('keeping only individual bronze/silver/gold subscribers')
is_individual = df['markettype'] == 'Individual'
has_kept_metal = df['metal'].isin([2, 3, 4])
df = df.loc[is_individual & has_kept_metal]
print(f'now {len(df)} observations')

# Row identifier: personkey when it is present, otherwise the subscriber id
# with a 'sub:' prefix.
missing_person = df['personkey'].isna()
fallback_id = 'sub:' + df['subscriberid']
df['id'] = np.where(missing_person, fallback_id, df['personkey'])

# Collapse to one row per (id, year, nummonthsmode).
# NOTE(review): groupby drops rows whose key columns are NaN by default, and
# first() takes the first non-null value of each column within a group, so a
# collapsed row can combine values from several input rows -- confirm that
# both effects are intended.
dedupe_keys = ['id', 'year', 'nummonthsmode']
df = df.groupby(dedupe_keys).first().reset_index()

# Turn the bin labels of each binned variable into the list of "cuts" that
# separate consecutive bins, then persist and print those cuts.


def _bin_table(frame, var):
    """Return the distinct (var, var + '_label') pairs, sorted by var.

    Rows where `var` is missing are ignored.
    """
    label_col = var + '_label'
    observed = frame.loc[frame[var].notna(), [var, label_col]]
    return observed.drop_duplicates().sort_values(var)


def _upper_cuts(bins, var):
    """Extract the second comma-separated field of each bin label.

    The first and last characters of every label are stripped before
    splitting (assumes labels look like '(lo, hi]' -- TODO confirm). The
    pieces are kept as strings exactly as they appear after the comma, and
    the entry for the last bin is dropped.
    """
    inner = bins[var + '_label'].str[1:-1]
    upper = inner.str.split(',').str[1]
    return upper.reset_index(drop=True).iloc[:-1]


age_bins = _bin_table(df, VAR_AGE)
inc_bins = _bin_table(df, VAR_INC)
acg_bins = _bin_table(df, VAR_ACG)

age_cuts = _upper_cuts(age_bins, VAR_AGE)
inc_cuts = _upper_cuts(inc_bins, VAR_INC)
acg_cuts = _upper_cuts(acg_bins, VAR_ACG)

# Pickle every series of cuts to its configured location.
for cuts, target in ((age_cuts, CUTS_AGE),
                     (inc_cuts, CUTS_INC),
                     (acg_cuts, CUTS_ACG)):
    cuts.to_pickle(target)

# Echo the cuts for a quick visual check.
for heading, cuts in (('age cuts:', age_cuts),
                      ('inc cuts:', inc_cuts),
                      ('acg cuts:', acg_cuts)):
    print(heading, list(cuts.values))

# Rename the constructed plan-year column to chosen_plan_id, keep only the
# columns needed downstream, and write them out.
df = df.rename(columns={'constructed_plan_year': 'chosen_plan_id'})

# Note: the derived 'id' column is not part of the saved output.
output_columns = [
    'subscriberid',
    'personkey',
    'age',
    'sum_concurrent_risk',
    'chosen_plan_id',
    'group_id',
]
df = df[output_columns]

# The path is relative to the current working directory.
df.to_csv('../../data/1_subscribers.csv', index=False)
